In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [ ]:
!pip install shap==0.43.0 umap-learn==0.5.4
Requirement already satisfied: shap==0.43.0 in /usr/local/lib/python3.10/dist-packages (0.43.0)
Requirement already satisfied: umap-learn==0.5.4 in /usr/local/lib/python3.10/dist-packages (0.5.4)
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (1.23.5)
Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (1.11.3)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (1.2.2)
Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (1.5.3)
Requirement already satisfied: tqdm>=4.27.0 in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (4.66.1)
Requirement already satisfied: packaging>20.9 in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (23.2)
Requirement already satisfied: slicer==0.0.7 in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (0.0.7)
Requirement already satisfied: numba in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (0.56.4)
Requirement already satisfied: cloudpickle in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (2.2.1)
Requirement already satisfied: pynndescent>=0.5 in /usr/local/lib/python3.10/dist-packages (from umap-learn==0.5.4) (0.5.10)
Requirement already satisfied: tbb>=2019.0 in /usr/local/lib/python3.10/dist-packages (from umap-learn==0.5.4) (2021.10.0)
Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba->shap==0.43.0) (0.39.1)
Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from numba->shap==0.43.0) (67.7.2)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.10/dist-packages (from pynndescent>=0.5->umap-learn==0.5.4) (1.3.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->shap==0.43.0) (3.2.0)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->shap==0.43.0) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->shap==0.43.0) (2023.3.post1)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->shap==0.43.0) (1.16.0)

Only tumor wo normal(healthy) preprocessing¶

In [ ]:
import shap
import pandas as pd
import numpy as np
df =  pd.read_pickle('/content/drive/MyDrive/pan_cancer_diner/Gene parsing  code /11093rx39979c.pkl')
meta= pd.read_pickle('/content/drive/MyDrive/pan_cancer_diner/Clinical data from  tcga.R/metadata.pkl')
In [ ]:
meta.columns
Out[ ]:
Index(['Unnamed: 0', 'data_type', 'updated_datetime', 'file_name', 'md5sum',
       'data_category', 'experimental_strategy', 'project', 'sample_uuid',
       'sample_barcode', 'tumor', 'patient_barcode', 'gender', 'vital',
       'days_to_contact', 'days_to_death', 'days_to_birth', 'panel',
       'histology', 'tissue_site', 'stage', 'T', 'N', 'M', 'residual_tumor',
       'new_tumor_events', 'follow_ups'],
      dtype='object')
In [ ]:
df
Out[ ]:
5S_rRNA 5_8S_rRNA 7SK A1BG A1BG-AS1 A1CF A2M A2M-AS1 A2ML1 A2ML1-AS1 ... ZYG11A ZYG11AP1 ZYG11B ZYX ZYXP1 ZZEF1 ZZZ3 hsa-mir-1253 hsa-mir-423 snoZ196
Unnamed: 0
84fd87d4-9b47-4852-b45c-1f681a58832c 4 0 949 48 74 2 27981 15 288 0 ... 1 0 1057 4318 0 973 924 0 0 1
2da36252-af74-4e0c-ae38-5f5fc6bdad6e 0 0 285 10 41 4 33182 80 586 0 ... 0 0 3719 5613 0 10839 1977 0 0 2
20ef7e72-9b76-4a4f-985c-1c35503b3e86 0 0 82 2 20 0 14847 115 122 0 ... 185 0 2622 6686 0 2412 2116 0 0 4
83acd71c-c12a-4394-ba8d-05e9c5ad2cf1 1 0 502 44 278 9 50129 139 26 1 ... 531 0 6497 9902 0 7520 6272 0 0 0
04d7ee0b-95ce-4af2-8140-547e9c6bd187 2 0 612 6 84 3 31641 55 518 0 ... 8 0 2446 11845 0 4063 2797 0 0 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5d98ad4a-483f-4032-ae9e-0ab6d398da98 3 0 3455 81 216 1 23 4 10 0 ... 2 0 907 28542 0 3221 1042 0 0 8
e0822da0-24d1-413a-b87f-5c6a242d3732 0 0 254 29 192 9 53226 74 38 1 ... 126 0 2741 11978 0 5163 5026 0 0 5
3a74e5d3-48e7-4df9-85e1-dd02d1c804c1 1 0 95 135 674 0 41441 54 12 0 ... 72 0 2296 27635 0 5424 2489 0 0 6
86fd69ec-fbe4-4ca7-a1a5-a42367527929 1 0 150 20 53 3 7448 21 12578 0 ... 356 0 854 7628 0 1908 1095 0 0 3
a811e4bd-d1d5-4a97-bc7f-899906c7f145 2 0 48 45 232 3 67745 22 19 0 ... 434 0 2194 8696 0 2316 1478 0 0 4

11093 rows × 39979 columns

Live is 0 Death is 1¶

In [ ]:
df = df[meta['tumor'] == True]
df['vital']= meta['vital']
df =df.dropna()
Boolean Series key will be reindexed to match DataFrame index.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
In [ ]:
df['vital'].unique()
Out[ ]:
array(['Alive', 'Dead'], dtype=object)
In [ ]:
y = df['vital'].replace({'Alive': 1, 'Dead': 0})
y
Out[ ]:
Unnamed: 0
84fd87d4-9b47-4852-b45c-1f681a58832c    1
2da36252-af74-4e0c-ae38-5f5fc6bdad6e    1
83acd71c-c12a-4394-ba8d-05e9c5ad2cf1    1
04d7ee0b-95ce-4af2-8140-547e9c6bd187    1
171f45e7-83e7-4286-970a-0dafa6f46d8a    1
                                       ..
21ea6561-2bd1-4a3d-ae70-3c504530bb26    0
5d98ad4a-483f-4032-ae9e-0ab6d398da98    0
3a74e5d3-48e7-4df9-85e1-dd02d1c804c1    1
86fd69ec-fbe4-4ca7-a1a5-a42367527929    0
a811e4bd-d1d5-4a97-bc7f-899906c7f145    1
Name: vital, Length: 10339, dtype: int64
In [ ]:
y.value_counts()
Out[ ]:
1    7251
0    3088
Name: vital, dtype: int64
In [ ]:

In [ ]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Load sample data
#Without the columun named "vital"
X = np.log(df.iloc[:,:-1].values +1)
y = df['vital'].replace({'Alive': 1, 'Dead': 0})
y = y.values

# Cross-validation setup
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
aucs = []

clf = XGBClassifier(
    n_estimators= 1000,
    learning_rate=0.01,
    max_depth=20,
    subsample=0.5,
    colsample_bytree=0.2,
    colsample_bylevel=0.2,
    gamma=2,
    reg_alpha=0.5,
    reg_lambda=2,
    eval_metric='auc',
    use_label_encoder=False,
    grow_policy='lossguide',
    verbosity=3,
    #tree_method='gpu_hist',
    n_jobs=-1,tree_method = "hist", device = "cuda"
)


"""
clf = XGBClassifier(
    n_estimators= 200,
    eval_metric='auc',
    verbosity=3,
    #tree_method='gpu_hist',
    random_state=50,
    grow_policy='lossguide',
    tree_method='gpu_hist',

)
"""
Out[ ]:
"\nclf = XGBClassifier(\n    n_estimators= 200,\n    eval_metric='auc',\n    verbosity=3,\n    #tree_method='gpu_hist',\n    random_state=50,\n    grow_policy='lossguide',\n    tree_method='gpu_hist',\n\n)\n"

StratifiedKFold is a variation of k-fold cross-validation that returns stratified folds. "Stratified" means that each fold is made by preserving the percentage of samples for each class. This is especially useful when you have an imbalanced dataset where one class significantly outnumbers the other(s).

In [ ]:

In [ ]:
# Create a figure for the ROC curves
plt.figure(figsize=(10, 8))
from sklearn.metrics import roc_curve, roc_auc_score, classification_report
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)

    # Predict the class labels
    y_pred_class = clf.predict(X_test)

    # Print the classification report for this fold
    print(f"Classification Report for Fold {len(aucs) + 1}:")
    print(classification_report(y_test, y_pred_class))
    print("-" * 50)  # For visual separation between reports

    # Predict the probability of positive class
    y_pred = clf.predict_proba(X_test)[:, 1]

    # Compute and store AUC for the fold
    auc = roc_auc_score(y_test, y_pred)
    aucs.append(auc)

    # Compute ROC curve and plot
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    plt.plot(fpr, tpr, label=f'AUC (Fold {len(aucs)}): {auc:.2f}')

# Plotting the ROC curves
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for each fold')
plt.legend(loc="lower right")
plt.show()
plt.savefig("ROC.png", dpi=600)  # dpi can be adjusted based on desired resolution

# Reporting average and standard deviation of AUC
print(f"Mean AUC: {np.mean(aucs):.2f}")
print(f"Std AUC: {np.std(aucs):.2f}")
[20:08:39] ======== Monitor (0): HostSketchContainer ========
[20:08:39] AllReduce: 1.31617s, 1 calls @ 1316166us

[20:08:39] MakeCuts: 1.44378s, 1 calls @ 1443776us

[20:08:39] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:08:39] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[20:08:40] ======== Monitor (0):  ========
[20:08:40] InitCompressedData: 0.008849s, 1 calls @ 8849us

[20:14:48] ======== Monitor (0): Learner ========
[20:14:48] Configure: 0.000931s, 1 calls @ 931us

[20:14:48] EvalOneIter: 0.010252s, 1000 calls @ 10252us

[20:14:48] GetGradient: 0.091119s, 1000 calls @ 91119us

[20:14:48] PredictRaw: 0.001497s, 1000 calls @ 1497us

[20:14:48] UpdateOneIter: 368.323s, 1000 calls @ 368323197us

[20:14:48] ======== Monitor (0): GBTree ========
[20:14:48] BoostNewTrees: 368.212s, 1000 calls @ 368212163us

[20:14:48] CommitModel: 0.000947s, 1000 calls @ 947us

[20:14:48] ======== Device 0 Memory Allocations:  ========
[20:14:48] Peak memory usage: 11829MiB
[20:14:48] Number of allocations: 630133
[20:14:48] ======== Monitor (0): updater_gpu_hist ========
[20:14:48] InitData: 0.001516s, 1000 calls @ 1516us

[20:14:48] InitDataOnce: 0.001319s, 1 calls @ 1319us

[20:14:48] Update: 368.168s, 1000 calls @ 368168311us

[20:14:48] UpdatePredictionCache: 0.033227s, 1000 calls @ 33227us

[20:14:48] ======== Monitor (0): gradient_based_sampler ========
[20:14:48] Sample: 0.174361s, 1000 calls @ 174361us

[20:14:48] ======== Monitor (0): GPUHistMakerDevice0 ========
[20:14:48] AllReduce: 0.045497s, 129374 calls @ 45497us

[20:14:48] BuildHist: 1.00499s, 75124 calls @ 1004988us

[20:14:48] EvaluateSplits: 309.249s, 75124 calls @ 309249015us

[20:14:48] FinalisePosition: 0.059938s, 1000 calls @ 59938us

[20:14:48] InitRoot: 52.242s, 1000 calls @ 52242020us

[20:14:48] Reset: 1.66683s, 1000 calls @ 1666827us

[20:14:48] UpdatePosition: 3.65072s, 75124 calls @ 3650720us

[20:14:48] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:14:48] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[20:14:48] WARNING: /workspace/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.

This warning will only be shown once.

Classification Report for Fold 1:
              precision    recall  f1-score   support

           0       0.63      0.33      0.43       617
           1       0.76      0.92      0.83      1451

    accuracy                           0.74      2068
   macro avg       0.70      0.62      0.63      2068
weighted avg       0.72      0.74      0.71      2068

--------------------------------------------------
[20:15:09] ======== Monitor (0): HostSketchContainer ========
[20:15:09] AllReduce: 1.34397s, 1 calls @ 1343965us

[20:15:09] MakeCuts: 1.48077s, 1 calls @ 1480768us

[20:15:09] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:15:09] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[20:15:10] ======== Monitor (0):  ========
[20:15:10] InitCompressedData: 0.008957s, 1 calls @ 8957us

[20:21:23] ======== Monitor (0): Learner ========
[20:21:23] Configure: 0.000767s, 1 calls @ 767us

[20:21:23] EvalOneIter: 0.010911s, 1000 calls @ 10911us

[20:21:23] GetGradient: 0.092169s, 1000 calls @ 92169us

[20:21:23] PredictRaw: 0.001585s, 1000 calls @ 1585us

[20:21:23] UpdateOneIter: 373.295s, 1000 calls @ 373295404us

[20:21:23] ======== Monitor (0): GBTree ========
[20:21:23] BoostNewTrees: 373.182s, 1000 calls @ 373182309us

[20:21:23] CommitModel: 0.001027s, 1000 calls @ 1027us

[20:21:23] ======== Device 0 Memory Allocations:  ========
[20:21:23] Peak memory usage: 12769MiB
[20:21:23] Number of allocations: 1262846
[20:21:23] ======== Monitor (0): updater_gpu_hist ========
[20:21:23] InitData: 0.000863s, 1000 calls @ 863us

[20:21:23] InitDataOnce: 0.000676s, 1 calls @ 676us

[20:21:23] Update: 373.138s, 1000 calls @ 373138015us

[20:21:23] UpdatePredictionCache: 0.032849s, 1000 calls @ 32849us

[20:21:23] ======== Monitor (0): gradient_based_sampler ========
[20:21:23] Sample: 0.175867s, 1000 calls @ 175867us

[20:21:23] ======== Monitor (0): GPUHistMakerDevice0 ========
[20:21:23] AllReduce: 0.045384s, 131640 calls @ 45384us

[20:21:23] BuildHist: 1.00998s, 75644 calls @ 1009982us

[20:21:23] EvaluateSplits: 314.141s, 75644 calls @ 314141292us

[20:21:23] FinalisePosition: 0.063456s, 1000 calls @ 63456us

[20:21:23] InitRoot: 52.2914s, 1000 calls @ 52291443us

[20:21:23] Reset: 1.6703s, 1000 calls @ 1670298us

[20:21:23] UpdatePosition: 3.66174s, 75644 calls @ 3661742us

[20:21:23] ======== Monitor (0): Learner ========
[20:21:23] Configure: 0.000615s, 1 calls @ 615us

[20:21:23] ======== Monitor (0): GBTree ========
[20:21:23] ======== Device 0 Memory Allocations:  ========
[20:21:23] Peak memory usage: 12769MiB
[20:21:23] Number of allocations: 1262846
[20:21:23] ======== Monitor (0): updater_gpu_hist ========
[20:21:23] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:21:23] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
Classification Report for Fold 2:
              precision    recall  f1-score   support

           0       0.68      0.33      0.45       618
           1       0.77      0.93      0.84      1450

    accuracy                           0.75      2068
   macro avg       0.72      0.63      0.64      2068
weighted avg       0.74      0.75      0.72      2068

--------------------------------------------------
[20:21:45] ======== Monitor (0): HostSketchContainer ========
[20:21:45] AllReduce: 1.44328s, 1 calls @ 1443277us

[20:21:45] MakeCuts: 1.60879s, 1 calls @ 1608792us

[20:21:45] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:21:45] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[20:21:45] ======== Monitor (0):  ========
[20:21:45] InitCompressedData: 0.008925s, 1 calls @ 8925us

[20:27:54] ======== Monitor (0): Learner ========
[20:27:54] Configure: 0.001262s, 1 calls @ 1262us

[20:27:54] EvalOneIter: 0.010574s, 1000 calls @ 10574us

[20:27:54] GetGradient: 0.093528s, 1000 calls @ 93528us

[20:27:54] PredictRaw: 0.001559s, 1000 calls @ 1559us

[20:27:54] UpdateOneIter: 368.85s, 1000 calls @ 368849926us

[20:27:54] ======== Monitor (0): GBTree ========
[20:27:54] BoostNewTrees: 368.736s, 1000 calls @ 368735585us

[20:27:54] CommitModel: 0.000968s, 1000 calls @ 968us

[20:27:54] ======== Device 0 Memory Allocations:  ========
[20:27:54] Peak memory usage: 13694MiB
[20:27:54] Number of allocations: 1892995
[20:27:54] ======== Monitor (0): updater_gpu_hist ========
[20:27:54] InitData: 0.000861s, 1000 calls @ 861us

[20:27:54] InitDataOnce: 0.000675s, 1 calls @ 675us

[20:27:54] Update: 368.692s, 1000 calls @ 368691518us

[20:27:54] UpdatePredictionCache: 0.032845s, 1000 calls @ 32845us

[20:27:54] ======== Monitor (0): gradient_based_sampler ========
[20:27:54] Sample: 0.175475s, 1000 calls @ 175475us

[20:27:54] ======== Monitor (0): GPUHistMakerDevice0 ========
[20:27:54] AllReduce: 0.04477s, 129606 calls @ 44770us

[20:27:54] BuildHist: 1.00104s, 75322 calls @ 1001038us

[20:27:54] EvaluateSplits: 309.799s, 75322 calls @ 309798851us

[20:27:54] FinalisePosition: 0.064922s, 1000 calls @ 64922us

[20:27:54] InitRoot: 52.2193s, 1000 calls @ 52219262us

[20:27:54] Reset: 1.66512s, 1000 calls @ 1665122us

[20:27:54] UpdatePosition: 3.64545s, 75322 calls @ 3645452us

[20:27:54] ======== Monitor (0): Learner ========
[20:27:54] Configure: 0.000954s, 1 calls @ 954us

[20:27:54] ======== Monitor (0): GBTree ========
[20:27:54] ======== Device 0 Memory Allocations:  ========
[20:27:54] Peak memory usage: 13694MiB
[20:27:54] Number of allocations: 1892995
[20:27:54] ======== Monitor (0): updater_gpu_hist ========
[20:27:54] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:27:54] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
Classification Report for Fold 3:
              precision    recall  f1-score   support

           0       0.63      0.37      0.46       618
           1       0.77      0.91      0.83      1450

    accuracy                           0.75      2068
   macro avg       0.70      0.64      0.65      2068
weighted avg       0.73      0.75      0.72      2068

--------------------------------------------------
[20:28:13] ======== Monitor (0): HostSketchContainer ========
[20:28:13] AllReduce: 1.25285s, 1 calls @ 1252853us

[20:28:13] MakeCuts: 1.36541s, 1 calls @ 1365414us

[20:28:13] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:28:13] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[20:28:14] ======== Monitor (0):  ========
[20:28:14] InitCompressedData: 0.008855s, 1 calls @ 8855us

[20:34:20] ======== Monitor (0): Learner ========
[20:34:20] Configure: 0.001267s, 1 calls @ 1267us

[20:34:20] EvalOneIter: 0.010215s, 1000 calls @ 10215us

[20:34:20] GetGradient: 0.093642s, 1000 calls @ 93642us

[20:34:20] PredictRaw: 0.001559s, 1000 calls @ 1559us

[20:34:20] UpdateOneIter: 366.402s, 1000 calls @ 366401654us

[20:34:20] ======== Monitor (0): GBTree ========
[20:34:20] BoostNewTrees: 366.287s, 1000 calls @ 366286987us

[20:34:20] CommitModel: 0.000892s, 1000 calls @ 892us

[20:34:20] ======== Device 0 Memory Allocations:  ========
[20:34:20] Peak memory usage: 14620MiB
[20:34:20] Number of allocations: 2518290
[20:34:20] ======== Monitor (0): updater_gpu_hist ========
[20:34:20] InitData: 0.000799s, 1000 calls @ 799us

[20:34:20] InitDataOnce: 0.000628s, 1 calls @ 628us

[20:34:20] Update: 366.243s, 1000 calls @ 366243243us

[20:34:20] UpdatePredictionCache: 0.032925s, 1000 calls @ 32925us

[20:34:20] ======== Monitor (0): gradient_based_sampler ========
[20:34:20] Sample: 0.17309s, 1000 calls @ 173090us

[20:34:20] ======== Monitor (0): GPUHistMakerDevice0 ========
[20:34:20] AllReduce: 0.044932s, 128490 calls @ 44932us

[20:34:20] BuildHist: 0.998775s, 74739 calls @ 998775us

[20:34:20] EvaluateSplits: 307.39s, 74739 calls @ 307390007us

[20:34:20] FinalisePosition: 0.064006s, 1000 calls @ 64006us

[20:34:20] InitRoot: 52.2003s, 1000 calls @ 52200279us

[20:34:20] Reset: 1.65857s, 1000 calls @ 1658569us

[20:34:20] UpdatePosition: 3.63283s, 74739 calls @ 3632833us

[20:34:20] ======== Monitor (0): Learner ========
[20:34:20] Configure: 0.000771s, 1 calls @ 771us

[20:34:20] ======== Monitor (0): GBTree ========
[20:34:20] ======== Device 0 Memory Allocations:  ========
[20:34:20] Peak memory usage: 14620MiB
[20:34:20] Number of allocations: 2518290
[20:34:20] ======== Monitor (0): updater_gpu_hist ========
[20:34:20] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:34:20] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
Classification Report for Fold 4:
              precision    recall  f1-score   support

           0       0.65      0.37      0.47       618
           1       0.77      0.92      0.84      1450

    accuracy                           0.75      2068
   macro avg       0.71      0.64      0.65      2068
weighted avg       0.74      0.75      0.73      2068

--------------------------------------------------
[20:34:42] ======== Monitor (0): HostSketchContainer ========
[20:34:42] AllReduce: 1.20227s, 1 calls @ 1202273us

[20:34:42] MakeCuts: 1.33984s, 1 calls @ 1339839us

[20:34:42] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:34:42] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[20:34:42] ======== Monitor (0):  ========
[20:34:42] InitCompressedData: 0.008832s, 1 calls @ 8832us

[20:40:52] ======== Monitor (0): Learner ========
[20:40:52] Configure: 0.001375s, 1 calls @ 1375us

[20:40:52] EvalOneIter: 0.010712s, 1000 calls @ 10712us

[20:40:52] GetGradient: 0.092329s, 1000 calls @ 92329us

[20:40:52] PredictRaw: 0.001562s, 1000 calls @ 1562us

[20:40:52] UpdateOneIter: 370.296s, 1000 calls @ 370295978us

[20:40:52] ======== Monitor (0): GBTree ========
[20:40:52] BoostNewTrees: 370.182s, 1000 calls @ 370182110us

[20:40:52] CommitModel: 0.001003s, 1000 calls @ 1003us

[20:40:52] ======== Device 0 Memory Allocations:  ========
[20:40:52] Peak memory usage: 15553MiB
[20:40:52] Number of allocations: 3147564
[20:40:52] ======== Monitor (0): updater_gpu_hist ========
[20:40:52] InitData: 0.000786s, 1000 calls @ 786us

[20:40:52] InitDataOnce: 0.000619s, 1 calls @ 619us

[20:40:52] Update: 370.138s, 1000 calls @ 370137965us

[20:40:52] UpdatePredictionCache: 0.032805s, 1000 calls @ 32805us

[20:40:52] ======== Monitor (0): gradient_based_sampler ========
[20:40:52] Sample: 0.173104s, 1000 calls @ 173104us

[20:40:52] ======== Monitor (0): GPUHistMakerDevice0 ========
[20:40:52] AllReduce: 0.045781s, 130849 calls @ 45781us

[20:40:52] BuildHist: 1.01018s, 75235 calls @ 1010177us

[20:40:52] EvaluateSplits: 311.165s, 75235 calls @ 311164990us

[20:40:52] FinalisePosition: 0.066201s, 1000 calls @ 66201us

[20:40:52] InitRoot: 52.271s, 1000 calls @ 52270975us

[20:40:52] Reset: 1.66845s, 1000 calls @ 1668446us

[20:40:52] UpdatePosition: 3.65671s, 75235 calls @ 3656707us

[20:40:52] ======== Monitor (0): Learner ========
[20:40:52] Configure: 0.000789s, 1 calls @ 789us

[20:40:52] ======== Monitor (0): GBTree ========
[20:40:52] ======== Device 0 Memory Allocations:  ========
[20:40:52] Peak memory usage: 15553MiB
[20:40:52] Number of allocations: 3147564
[20:40:52] ======== Monitor (0): updater_gpu_hist ========
[20:40:52] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:40:52] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
Classification Report for Fold 5:
              precision    recall  f1-score   support

           0       0.67      0.32      0.43       617
           1       0.76      0.93      0.84      1450

    accuracy                           0.75      2067
   macro avg       0.72      0.63      0.64      2067
weighted avg       0.73      0.75      0.72      2067

--------------------------------------------------
Mean AUC: 0.78
Std AUC: 0.01
<Figure size 640x480 with 0 Axes>
In [ ]:
clf
Out[ ]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=0.2, colsample_bynode=None,
              colsample_bytree=0.2, device='cuda', early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', feature_types=None,
              gamma=2, grow_policy='lossguide', importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=20, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=1000, n_jobs=-1,
              num_parallel_tree=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=0.2, colsample_bynode=None,
              colsample_bytree=0.2, device='cuda', early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', feature_types=None,
              gamma=2, grow_policy='lossguide', importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=20, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=1000, n_jobs=-1,
              num_parallel_tree=None, random_state=None, ...)

WITH ALL DATA SHAPS

In [ ]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Load sample data
#Without the columun named "vital"
X = np.log(df.iloc[:,:-1].values +1)
y = df['vital'].replace({'Alive': 1, 'Dead': 0})
y = y.values
In [ ]:
"""
clf = XGBClassifier(
    n_estimators= 100,
    eval_metric='auc',
    verbosity=3,
    #tree_method='gpu_hist',
    random_state=50,
    grow_policy='lossguide',
    tree_method='gpu_hist',

)
"""
Out[ ]:
"\nclf = XGBClassifier(\n    n_estimators= 100,\n    eval_metric='auc',\n    verbosity=3,\n    #tree_method='gpu_hist',\n    random_state=50,\n    grow_policy='lossguide',\n    tree_method='gpu_hist',\n\n)\n"
In [ ]:
# Let's compute SHAP values for the last test set after the final cross-validation loop
explainer = shap.TreeExplainer(clf.fit(X,y))
[20:41:28] ======== Monitor (0): HostSketchContainer ========
[20:41:28] AllReduce: 1.49339s, 1 calls @ 1493388us

[20:41:28] MakeCuts: 1.62093s, 1 calls @ 1620927us

[20:41:28] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:41:28] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[20:41:28] ======== Monitor (0):  ========
[20:41:28] InitCompressedData: 0.011051s, 1 calls @ 11051us

[20:49:27] ======== Monitor (0): Learner ========
[20:49:27] Configure: 0.000785s, 1 calls @ 785us

[20:49:27] EvalOneIter: 0.011364s, 1000 calls @ 11364us

[20:49:27] GetGradient: 0.109481s, 1000 calls @ 109481us

[20:49:27] PredictRaw: 0.001669s, 1000 calls @ 1669us

[20:49:27] UpdateOneIter: 479.114s, 1000 calls @ 479114061us

[20:49:27] ======== Monitor (0): GBTree ========
[20:49:27] BoostNewTrees: 478.983s, 1000 calls @ 478983042us

[20:49:27] CommitModel: 0.001046s, 1000 calls @ 1046us

[20:49:27] ======== Device 0 Memory Allocations:  ========
[20:49:27] Peak memory usage: 16741MiB
[20:49:27] Number of allocations: 3926647
[20:49:27] ======== Monitor (0): updater_gpu_hist ========
[20:49:27] InitData: 0.000799s, 1000 calls @ 799us

[20:49:27] InitDataOnce: 0.000618s, 1 calls @ 618us

[20:49:27] Update: 478.936s, 1000 calls @ 478935552us

[20:49:27] UpdatePredictionCache: 0.03559s, 1000 calls @ 35590us

[20:49:27] ======== Monitor (0): gradient_based_sampler ========
[20:49:27] Sample: 0.211724s, 1000 calls @ 211724us

[20:49:27] ======== Monitor (0): GPUHistMakerDevice0 ========
[20:49:27] AllReduce: 0.056498s, 167249 calls @ 56498us

[20:49:27] BuildHist: 1.24579s, 93894 calls @ 1245793us

[20:49:27] EvaluateSplits: 409.731s, 93894 calls @ 409730747us

[20:49:27] FinalisePosition: 0.07003s, 1000 calls @ 70030us

[20:49:27] InitRoot: 61.2494s, 1000 calls @ 61249417us

[20:49:27] Reset: 1.72209s, 1000 calls @ 1722089us

[20:49:27] UpdatePosition: 4.54865s, 93894 calls @ 4548653us

[20:49:27] ======== Monitor (0): Learner ========
[20:49:27] Configure: 0.001535s, 1 calls @ 1535us

[20:49:27] ======== Monitor (0): GBTree ========
[20:49:27] ======== Device 0 Memory Allocations:  ========
[20:49:27] Peak memory usage: 16741MiB
[20:49:27] Number of allocations: 3926647
[20:49:27] ======== Monitor (0): updater_gpu_hist ========
[20:49:27] WARNING: /workspace/src/c_api/c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.
In [ ]:
shap_values = explainer.shap_values(X)
In [ ]:
shap.summary_plot(shap_values, df.iloc[:,:-1], plot_type="bar", max_display=100)
In [ ]:
import matplotlib
#matplotlib.use('Agg')

import shap
import matplotlib.pyplot as plt
import shap
import matplotlib.pyplot as plt

# Set up a figure
fig, ax = plt.subplots(figsize=(10, 6))

# Generate the SHAP plot
shap.summary_plot(shap_values, df.iloc[:,:-1], plot_type="bar", max_display=30)
plt.savefig("shap_summary_plot_global.png", dpi=600)  # dpi can be adjusted based on desired resolution

# Save and display
plt.tight_layout()
plt.show()
<Figure size 640x480 with 0 Axes>
In [ ]:
import numpy as np
import pandas as pd
import plotly.express as px

# Asumiendo que shap_values es tu matriz de valores SHAP y ya tienes un DataFrame de pandas df y meta definidos.
shap_df = pd.DataFrame(shap_values, columns=df.columns[:-1], index=df.index)

# Agregar la variable "panel" del DataFrame "meta" al DataFrame SHAP
shap_df["panel"] = meta['panel']

# Calcular valores SHAP absolutos para obtener importancias de características
shap_importances = np.abs(shap_df.iloc[:, :-1]).mean(axis=0)
top = 50

# Obtener los índices de las características principales basadas en sus importancias
top_feature_indices = np.argsort(shap_importances)[-top:]

# Seleccionar las características principales según sus índices
top_features = shap_df.columns[top_feature_indices]

# Derretir el DataFrame para la representación basada en tonos
melted_shap_df = pd.melt(shap_df, id_vars=["panel"], value_vars=top_features, var_name="variable")

# Ordena melted_shap_df por 'variable' y luego por 'panel' para que el orden de los plots coincida con el orden alfabético de las etiquetas
sorted_panels = sorted(melted_shap_df['panel'].unique())
melted_shap_df['panel'] = pd.Categorical(melted_shap_df['panel'], categories=sorted_panels, ordered=True)

# Ahora, ordena por variable y por el orden del panel
melted_shap_df = melted_shap_df.sort_values(by=['variable', 'panel'])

# Crear un diagrama de caja con Plotly
fig = px.box(
    melted_shap_df,
    x="value",
    y="variable",
    color="panel",
    title="Top 50 SHAP Values by Panel",
    labels={"value": "SHAP Value", "variable": "Feature", "panel": "Panel"},
    orientation="h",
    category_orders={"variable": top_features.tolist()}
)

# Agregar línea vertical en x=0
fig.add_shape(
    type='line',
    line=dict(
        dash='dash',
    ),
    x0=0,
    x1=0,
    y0=-0.5,
    y1=top - 0.5
)

fig.update_layout(
    xaxis_title="SHAP Value",
    yaxis_title="Feature",
    boxmode='group',
    margin=dict(l=0, r=0, t=30, b=0)
)

fig.show()

# Guardar la figura en un archivo HTML
fig.write_html("shap_summary_plotly.html")
In [ ]:
import matplotlib

import shap
import matplotlib.pyplot as plt
import shap
import matplotlib.pyplot as plt

# Set up a figure
fig, ax = plt.subplots(figsize=(10, 4))

# Generate the SHAP plot
shap.summary_plot(shap_values, np.log(df.iloc[:, :-1] +1), plot_type="layered_violin", max_display=30, color='seismic', show=False)
#seismic coolwarm
# Save and display
plt.tight_layout()
plt.savefig("shap_summary_plot_layered_seismic.png", dpi=600)  # dpi can be adjusted based on desired resolution

plt.show()
In [ ]:
import matplotlib

import shap
import matplotlib.pyplot as plt
import shap
import matplotlib.pyplot as plt
# Set style without gridlines
sns.set_style("white")
sns.set_context("talk")
# Set up a figure
fig, ax = plt.subplots(figsize=(10, 4))

# Generate the SHAP plot
shap.summary_plot(shap_values, np.log(df.iloc[:, :-1] +1), plot_type="dot", max_display=30, color='coolwarm', show=False)
#seismic coolwarm
# Save and display
plt.tight_layout()
plt.savefig("shap_summary_plot_dot.png", dpi=600)  # dpi can be adjusted based on desired resolution

plt.show()
In [ ]:
# Set style without gridlines
sns.set_style("white")
sns.set_context("talk")
import matplotlib

import shap
import matplotlib.pyplot as plt
import shap
import matplotlib.pyplot as plt

# Set up a figure
fig, ax = plt.subplots(figsize=(10, 4))

# Generate the SHAP plot
shap.summary_plot(shap_values, np.log(df.iloc[:, :-1] +1), plot_type="layered_violin", max_display=30, color='coolwarm', show=False)
#seismic coolwarm
# Save and display
plt.tight_layout()
plt.savefig("shap_summary_plot_layered_coolwarm.png", dpi=600)  # dpi can be adjusted based on desired resolution

plt.show()
In [ ]:
import matplotlib


import shap
import matplotlib.pyplot as plt
import shap
import matplotlib.pyplot as plt

# Set up a figure
fig, ax = plt.subplots(figsize=(10, 6))

# Generate the SHAP plot
shap.summary_plot(shap_values, df.iloc[:,:-1], plot_type="bar", max_display=30)

# Save and display
plt.tight_layout()
plt.show()
plt.savefig("shap_summary_plot_bar.png", dpi=600)  # dpi can be adjusted based on desired resolution
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
In [ ]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Set the style and context for the plot
sns.set_style("whitegrid")
sns.set_context("talk")

# Assuming shap_values_1 is your SHAP values matrix
# You can create a DataFrame to make it easier to work with
shap_df = pd.DataFrame(shap_values, columns=df.columns[:-1], index=df.index)

# Add the "panel" variable from the "meta" DataFrame to the SHAP DataFrame
shap_df["panel"] = meta['vital']

# Calculate absolute SHAP values to get feature importances
shap_importances = np.abs(shap_df.iloc[:, :-1]).mean(axis=0)
top = 30

# Get the indices of the top features based on their importances
top_feature_indices = np.argsort(shap_importances)[-top:]

# Select the top features based on their indices
top_features = shap_df.columns[top_feature_indices]

# Melt the DataFrame for hue-based plotting
melted_shap_df = pd.melt(shap_df, id_vars=["panel"], value_vars=top_features, var_name="variable")

# Set up the figure and axes
fig, ax = plt.subplots(figsize=(13, 17))

# Use sns.boxplot directly on the ax object with color referenced from your provided code
sns.boxplot(
    data=melted_shap_df, x="value", y="variable", hue="panel", ax=ax
)

# Add the red dotted line at x=0
ax.axvline(0, color='red', linestyle='--', linewidth=1.5)

# Invert the Y axis so the most important features are at the top
ax.invert_yaxis()

# Customize title and labels for better aesthetics
ax.set_xlabel("SHAP Value")
ax.set_ylabel("Feature")

# Add a legend and adjust its appearance
ax.legend(title="Panel", title_fontsize='16', loc="upper right", fontsize='14')

plt.tight_layout()

# Save and display the figure
plt.savefig("shap_vital_box.png", dpi=600, bbox_inches='tight')
plt.show()
In [ ]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Set the style and context for the plot
sns.set_style("whitegrid")
sns.set_context("talk")


# Suponiendo que shap_values_1 es tu matriz de valores SHAP
# Puedes crear un DataFrame para facilitar su manipulación
shap_df = pd.DataFrame(shap_values, columns=df.columns[:-1], index=df.index)

# Añadir la variable "panel" del DataFrame "meta" al DataFrame SHAP
shap_df["panel"] = meta['panel']

# Calcular valores SHAP absolutos para obtener importancias de características
shap_importances = np.abs(shap_df.iloc[:, :-1]).mean(axis=0)
top = 5
# Obtener los índices de las características principales según su importancia
top_feature_indices = np.argsort(shap_importances)[-top:]

# Seleccionar las características principales según sus índices
top_features = shap_df.columns[top_feature_indices]

# Crear un gráfico boxen más grande con colores consistentes identificando el panel
plt.figure(figsize=(10, 10))
melted_shap_df = pd.melt(shap_df, id_vars=["panel"], value_vars=top_features[::-1], var_name="variable")

sns.boxplot(
    data=melted_shap_df, x="value", y="variable", hue="panel",
    hue_order=sorted(melted_shap_df['panel'].unique()),
    orient="h", dodge=True, fliersize=1,linewidth=0.5
)

# Añadir la línea roja punteada en x=0
plt.axvline(0, color='red', linestyle='--')

# Establecer las etiquetas y el título con tamaños ajustados
plt.xlabel("SHAP Value", fontsize=12)
plt.title("SHAP Value Distribution across Panels", fontsize=14)

# Mover la leyenda fuera del área del gráfico, ajustar su tamaño y orientación
plt.legend(title="Panel", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10, title_fontsize=12)

# Ajuste apretado para garantizar una visualización adecuada del gráfico
plt.tight_layout()
plt.ylabel("")
# Guardar el gráfico con una resolución alta
plt.savefig("shap_panel_box.png", dpi=600, bbox_inches='tight')

# Mostrar el gráfico
plt.show()
In [ ]:
save = pd.DataFrame(shap_values, columns= df.iloc[:,:-1].columns, index= df.iloc[:,:-1].index)
In [ ]:
save
Out[ ]:
5S_rRNA 5_8S_rRNA 7SK A1BG A1BG-AS1 A1CF A2M A2M-AS1 A2ML1 A2ML1-AS1 ... ZYG11A ZYG11AP1 ZYG11B ZYX ZYXP1 ZZEF1 ZZZ3 hsa-mir-1253 hsa-mir-423 snoZ196
Unnamed: 0
84fd87d4-9b47-4852-b45c-1f681a58832c 0.000147 0.0 -0.000372 0.000310 -0.000094 0.0 0.000354 -0.000274 0.000009 0.0 ... -0.002201 -0.000012 0.000112 0.000273 0.0 -0.000010 0.000019 0.0 0.0 0.000007
2da36252-af74-4e0c-ae38-5f5fc6bdad6e -0.000152 0.0 -0.000408 -0.000406 -0.000036 0.0 0.000252 0.000093 0.000013 0.0 ... -0.001279 -0.000012 0.000081 0.000498 0.0 0.000003 0.000027 0.0 0.0 0.000002
83acd71c-c12a-4394-ba8d-05e9c5ad2cf1 0.000013 0.0 0.000458 -0.000659 0.000037 0.0 0.002019 0.000050 0.000310 0.0 ... 0.011648 -0.000012 -0.000687 -0.000226 0.0 0.000032 -0.000030 0.0 0.0 0.000106
04d7ee0b-95ce-4af2-8140-547e9c6bd187 0.000025 0.0 0.000032 0.002015 0.000038 0.0 0.000143 0.000069 0.000032 0.0 ... -0.002264 -0.000012 0.000068 0.000256 0.0 0.000005 -0.000047 0.0 0.0 -0.000020
171f45e7-83e7-4286-970a-0dafa6f46d8a 0.000008 0.0 0.000031 -0.000201 0.000030 0.0 0.001092 0.000050 -0.000505 0.0 ... 0.004665 -0.000008 0.000298 0.000020 0.0 0.000005 0.000138 0.0 0.0 0.000071
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
21ea6561-2bd1-4a3d-ae70-3c504530bb26 -0.000024 0.0 0.000319 -0.000267 0.000024 0.0 -0.000099 0.000040 0.000016 0.0 ... -0.011076 -0.000012 0.000296 -0.000018 0.0 0.000053 -0.000120 0.0 0.0 0.000049
5d98ad4a-483f-4032-ae9e-0ab6d398da98 0.000015 0.0 -0.000140 0.000473 0.000269 0.0 -0.000489 -0.000035 0.000009 0.0 ... -0.001635 -0.000012 0.000073 -0.005564 0.0 -0.000006 0.000065 0.0 0.0 -0.000003
3a74e5d3-48e7-4df9-85e1-dd02d1c804c1 0.000037 0.0 -0.000012 -0.000020 0.000047 0.0 0.000458 0.000104 0.000032 0.0 ... 0.004966 -0.000012 0.000061 -0.000318 0.0 0.000007 0.000163 0.0 0.0 0.000014
86fd69ec-fbe4-4ca7-a1a5-a42367527929 0.000020 0.0 -0.000929 -0.000467 -0.000225 0.0 0.002150 -0.000295 -0.000201 0.0 ... 0.014701 -0.000008 0.000099 -0.001627 0.0 -0.000006 0.000024 0.0 0.0 -0.000058
a811e4bd-d1d5-4a97-bc7f-899906c7f145 0.000018 0.0 -0.001686 -0.000204 0.000038 0.0 0.000431 -0.000116 0.000013 0.0 ... 0.010899 -0.000008 0.000088 0.000192 0.0 -0.000007 0.000021 0.0 0.0 -0.000014

10339 rows × 39979 columns

In [ ]:
save.to_pickle("/content/drive/MyDrive/pan_cancer_diner/XGBOOST all shap files/shap_1.pkl")
In [ ]: